Libraries
library(ggplot2)
library(dplyr)
library(png)
library(grid)
library(plotly)
library(gganimate)
library(tidyr)
library(forecast)
Load data
load_csv <- function(csv_name, data_folder = 'data') {
df <- read.csv(file.path(data_folder, csv_name))
df
}
inventories_df <- load_csv('inventories.csv')
inventory_sets_df <- load_csv('inventory_sets.csv')
sets_df <- load_csv('sets.csv')
themes_df <- load_csv('themes.csv')
inventory_minifigs_df <- load_csv('inventory_minifigs.csv')
minifigs_df <- load_csv('minifigs.csv')
inventory_parts_df <- load_csv('inventory_parts.csv')
colors_df <- load_csv('colors.csv')
parts_df <- load_csv('parts.csv')
elements_df <- load_csv('elements.csv')
part_categories_df <- load_csv('part_categories.csv')
part_relationships_df <- load_csv('part_relationships.csv')
Basic tables analysis
Data schema

Tables
Inventories
| 1 |
1 |
7922-1 |
| 3 |
1 |
3931-1 |
| 4 |
1 |
6942-1 |
| 15 |
1 |
5158-1 |
| 16 |
1 |
903-1 |
| 17 |
1 |
850950-1 |
## id version set_num
## Min. : 1 Min. : 1.000 Length:37265
## 1st Qu.: 14424 1st Qu.: 1.000 Class :character
## Median : 54379 Median : 1.000 Mode :character
## Mean : 61104 Mean : 1.091
## 3rd Qu.: 88842 3rd Qu.: 1.000
## Max. :194312 Max. :16.000
Inventory sets
| 35 |
75911-1 |
1 |
| 35 |
75912-1 |
1 |
| 39 |
75048-1 |
1 |
| 39 |
75053-1 |
1 |
| 50 |
4515-1 |
1 |
| 50 |
4520-1 |
2 |
## id version set_num
## Min. : 1 Min. : 1.000 Length:37265
## 1st Qu.: 14424 1st Qu.: 1.000 Class :character
## Median : 54379 Median : 1.000 Mode :character
## Mean : 61104 Mean : 1.091
## 3rd Qu.: 88842 3rd Qu.: 1.000
## Max. :194312 Max. :16.000
Sets
## set_num name year theme_id
## Length:21880 Length:21880 Min. :1949 Min. : 1
## Class :character Class :character 1st Qu.:2001 1st Qu.:273
## Mode :character Mode :character Median :2012 Median :497
## Mean :2008 Mean :442
## 3rd Qu.:2018 3rd Qu.:608
## Max. :2024 Max. :752
## num_parts img_url
## Min. : 0.0 Length:21880
## 1st Qu.: 3.0 Class :character
## Median : 31.0 Mode :character
## Mean : 161.4
## 3rd Qu.: 139.0
## Max. :11695.0
Themes
| 1 |
Technic |
NA |
| 3 |
Competition |
1 |
| 4 |
Expert Builder |
1 |
| 16 |
RoboRiders |
1 |
| 17 |
Speed Slammers |
1 |
| 18 |
Star Wars |
1 |
## id name parent_id
## Min. : 1.0 Length:468 Min. : 1.0
## 1st Qu.:250.5 Class :character 1st Qu.:186.0
## Median :466.0 Mode :character Median :411.0
## Mean :433.5 Mean :360.6
## 3rd Qu.:625.2 3rd Qu.:512.5
## Max. :752.0 Max. :697.0
## NA's :145
Inventory Minifigs
| 3 |
fig-001549 |
1 |
| 4 |
fig-000764 |
1 |
| 19 |
fig-000555 |
1 |
| 25 |
fig-000574 |
1 |
| 26 |
fig-000842 |
1 |
| 26 |
fig-008641 |
1 |
## inventory_id fig_num quantity
## Min. : 3 Length:20858 Min. : 1.000
## 1st Qu.: 7869 Class :character 1st Qu.: 1.000
## Median : 15681 Mode :character Median : 1.000
## Mean : 43010 Mean : 1.062
## 3rd Qu.: 66834 3rd Qu.: 1.000
## Max. :194312 Max. :100.000
Minifigs
## fig_num name num_parts img_url
## Length:13764 Length:13764 Min. : 0.000 Length:13764
## Class :character Class :character 1st Qu.: 4.000 Class :character
## Mode :character Mode :character Median : 4.000 Mode :character
## Mean : 5.296
## 3rd Qu.: 5.000
## Max. :156.000
Inventory parts
## inventory_id part_num color_id quantity
## Min. : 1 Length:1180987 Min. : -1.0 Min. : 1.00
## 1st Qu.: 9404 Class :character 1st Qu.: 4.0 1st Qu.: 1.00
## Median : 22838 Mode :character Median : 15.0 Median : 2.00
## Mean : 50849 Mean : 131.8 Mean : 3.37
## 3rd Qu.: 87088 3rd Qu.: 71.0 3rd Qu.: 4.00
## Max. :194312 Max. :9999.0 Max. :3064.00
## is_spare img_url
## Length:1180987 Length:1180987
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Colors
| -1 |
[Unknown] |
0033B2 |
f |
| 0 |
Black |
05131D |
f |
| 1 |
Blue |
0055BF |
f |
| 2 |
Green |
237841 |
f |
| 3 |
Dark Turquoise |
008F9B |
f |
| 4 |
Red |
C91A09 |
f |
## id name rgb is_trans
## Min. : -1.0 Length:263 Length:263 Length:263
## 1st Qu.: 83.0 Class :character Class :character Class :character
## Median :1005.0 Mode :character Mode :character Mode :character
## Mean : 651.4
## 3rd Qu.:1070.5
## Max. :9999.0
Parts
| 003381 |
Sticker Sheet for Set 663-1 |
58 |
Plastic |
| 003383 |
Sticker Sheet for Sets 618-1, 628-2 |
58 |
Plastic |
| 003402 |
Sticker Sheet for Sets 310-3, 311-1, 312-3 |
58 |
Plastic |
| 003429 |
Sticker Sheet for Set 1550-1 |
58 |
Plastic |
| 003432 |
Sticker Sheet for Sets 357-1, 355-1, 940-1 |
58 |
Plastic |
| 003434 |
Sticker Sheet for Set 575-2, 653-1, 460-1 |
58 |
Plastic |
## part_num name part_cat_id part_material
## Length:52615 Length:52615 Min. : 1.00 Length:52615
## Class :character Class :character 1st Qu.:17.00 Class :character
## Mode :character Mode :character Median :41.00 Mode :character
## Mean :38.91
## 3rd Qu.:60.00
## Max. :68.00
Elements
| 6443403 |
2277c01pr0009 |
1 |
2277 |
| 6300211 |
67906c01 |
14 |
67908 |
| 4566309 |
2564 |
0 |
2564 |
| 4275423 |
53657 |
1004 |
53657 |
| 6194308 |
92926 |
71 |
28967 |
| 6229123 |
26561 |
4 |
26561 |
## element_id part_num color_id design_id
## Min. : 9327 Length:84138 Min. : -1.0 Min. : 1001
## 1st Qu.: 4259774 Class :character 1st Qu.: 8.0 1st Qu.: 18454
## Median : 6057754 Mode :character Median : 28.0 Median : 41748
## Mean : 5222065 Mean : 539.7 Mean : 45570
## 3rd Qu.: 6262024 3rd Qu.: 135.0 3rd Qu.: 75475
## Max. :61532443 Max. :9999.0 Max. :107520
## NA's :23682
Part Relationships
| P |
3626cpr3662 |
3626c |
| P |
87079pr9974 |
87079 |
| P |
3960pr9971 |
3960 |
| R |
98653pr0003 |
98086pr0003 |
| R |
98653pr0003 |
98088pat0003 |
| R |
98653pr0003 |
98089pat0003 |
## rel_type child_part_num parent_part_num
## Length:29977 Length:29977 Length:29977
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
Part categories
| 1 |
Baseplates |
| 3 |
Bricks Sloped |
| 4 |
Duplo, Quatro and Primo |
| 5 |
Bricks Special |
| 6 |
Bricks Wedged |
| 7 |
Containers |
## id name
## Min. : 1.00 Length:66
## 1st Qu.:19.25 Class :character
## Median :35.50 Mode :character
## Mean :35.36
## 3rd Qu.:51.75
## Max. :68.00
Dataframes size
All rows in dataframes: 1446639
All columns in dataframes: 45
All values in dataframes: 8099232
Analysis deep dive
Themes
Themes sum of rows with NA value
## [1] 145
All NA values are in parent_id column
## id name parent_id
## 1 0 0 145
Create themes_parent_df
| 1 |
1 |
Technic |
| 10 |
22 |
Creator |
| 12 |
34 |
Make & Create |
| 14 |
50 |
Town |
| 16 |
52 |
City |
| 43 |
112 |
Racers |
Join themes_parent_df with themes_df, insert name_child into
name_parent if name_parent is NA
| 1 |
Technic |
Technic |
| 3 |
Competition |
Technic |
| 4 |
Expert Builder |
Technic |
| 16 |
RoboRiders |
Technic |
| 17 |
Speed Slammers |
Technic |
| 18 |
Star Wars |
Technic |
Sets
Sets sum of rows with NA value
## [1] 0
Delete 2024 and 2023 from dataframe
Join sets with prepared themes
156 Parent themes with number of sets and sum of parts
| 4 Juniors |
54 |
2317 |
| Adventurers |
83 |
8458 |
| Agents |
28 |
11970 |
| Alpha Team |
31 |
4313 |
| Angry Birds |
6 |
2388 |
| Aquazone |
37 |
6577 |
Top 10 themes based on number of sets
## # A tibble: 15 × 3
## name_parent count_sets sum_parts
## <chr> <int> <int>
## 1 Gear 3144 8630
## 2 Duplo 1269 44002
## 3 Star Wars 902 324609
## 4 Books 869 7466
## 5 City 812 166527
## 6 Collectible Minifigures 803 5661
## 7 Technic 786 255869
## 8 Town 677 84587
## 9 Educational and Dacta 672 127642
## 10 Friends 514 101468
## 11 Service Packs 501 7060
## 12 Ninjago 490 122719
## 13 Creator 489 199572
## 14 System 464 21618
## 15 Bionicle 460 32668
## # A tibble: 428 × 4
## year name_parent count_sets sum_parts
## <int> <chr> <int> <int>
## 1 1949 System 5 498
## 2 1950 System 6 6
## 3 1953 System 4 50
## 4 1954 System 14 117
## 5 1955 Books 1 0
## 6 1955 System 35 439
## 7 1956 Books 1 0
## 8 1956 System 17 295
## 9 1957 System 20 783
## 10 1958 System 57 992
## # ℹ 418 more rows
Parts
Parts sum of rows with NA value
## [1] 0
### Part categories #### Part categories sum of rows with NA value
## [1] 0
Join parts with part_categories
| 003381 |
58 |
Plastic |
Stickers |
| 003383 |
58 |
Plastic |
Stickers |
| 003402 |
58 |
Plastic |
Stickers |
| 003429 |
58 |
Plastic |
Stickers |
| 003432 |
58 |
Plastic |
Stickers |
| 003434 |
58 |
Plastic |
Stickers |
## # A tibble: 116 × 3
## # Groups: name_categories [66]
## name_categories part_material count
## <chr> <chr> <int>
## 1 Bars, Ladders and Fences Plastic 127
## 2 Baseplates Cardboard/Paper 16
## 3 Baseplates Plastic 223
## 4 Belville, Scala and Fabuland Cloth 245
## 5 Belville, Scala and Fabuland Foam 17
## 6 Belville, Scala and Fabuland Plastic 431
## 7 Belville, Scala and Fabuland Rubber 1
## 8 Bricks Plastic 1555
## 9 Bricks Curved Plastic 592
## 10 Bricks Round and Cones Plastic 445
## # ℹ 106 more rows
Part relationships
Part rategories sum of rows with NA value
## [1] 23682
All NA values are in design_id column
## element_id part_num color_id design_id
## 1 0 0 0 23682
I don’t know what design_id reffers to (there is no table with
design_id key) so I’m going to remove design_id column
Join parts_with_categories with elements
| 003381 |
58 |
Plastic |
Stickers |
NA |
NA |
| 003383 |
58 |
Plastic |
Stickers |
NA |
NA |
| 003402 |
58 |
Plastic |
Stickers |
NA |
NA |
| 003429 |
58 |
Plastic |
Stickers |
NA |
NA |
| 003432 |
58 |
Plastic |
Stickers |
NA |
NA |
| 003434 |
58 |
Plastic |
Stickers |
NA |
NA |
Parts without any elements
| 003381 |
58 |
Plastic |
Stickers |
| 003383 |
58 |
Plastic |
Stickers |
| 003402 |
58 |
Plastic |
Stickers |
| 003429 |
58 |
Plastic |
Stickers |
| 003432 |
58 |
Plastic |
Stickers |
| 003434 |
58 |
Plastic |
Stickers |
Parts with any elements
| 68 |
01023 |
58 |
Plastic |
Stickers |
9999 |
| 69 |
01041 |
58 |
Plastic |
Stickers |
9999 |
| 70 |
01343 |
58 |
Plastic |
Stickers |
9999 |
| 71 |
01500 |
58 |
Plastic |
Stickers |
9999 |
| 72 |
01501 |
58 |
Plastic |
Stickers |
9999 |
| 73 |
01503 |
58 |
Plastic |
Stickers |
9999 |

Categories of parts without any elements
## # A tibble: 104 × 3
## # Groups: name_categories [65]
## name_categories part_material count
## <chr> <chr> <int>
## 1 Bars, Ladders and Fences Plastic 23
## 2 Baseplates Cardboard/Paper 6
## 3 Baseplates Plastic 191
## 4 Belville, Scala and Fabuland Cloth 213
## 5 Belville, Scala and Fabuland Foam 17
## 6 Belville, Scala and Fabuland Plastic 270
## 7 Belville, Scala and Fabuland Rubber 1
## 8 Bricks Plastic 1026
## 9 Bricks Curved Plastic 141
## 10 Bricks Round and Cones Plastic 66
## # ℹ 94 more rows
Categories of parts with any elements
## # A tibble: 95 × 3
## # Groups: name_categories [64]
## name_categories part_material count
## <chr> <chr> <int>
## 1 Bars, Ladders and Fences Plastic 1192
## 2 Baseplates Cardboard/Paper 10
## 3 Baseplates Plastic 182
## 4 Belville, Scala and Fabuland Cloth 34
## 5 Belville, Scala and Fabuland Plastic 578
## 6 Bricks Plastic 2551
## 7 Bricks Curved Plastic 2768
## 8 Bricks Round and Cones Plastic 1855
## 9 Bricks Sloped Plastic 2402
## 10 Bricks Special Plastic 1250
## # ℹ 85 more rows
Colors
Colors sum of rows with NA value
## [1] 0
Join parts_with_categories_and_elements with colors
| 003381 |
58 |
Plastic |
Stickers |
NA |
NA |
NA |
NA |
NA |
| 003383 |
58 |
Plastic |
Stickers |
NA |
NA |
NA |
NA |
NA |
| 003402 |
58 |
Plastic |
Stickers |
NA |
NA |
NA |
NA |
NA |
| 003429 |
58 |
Plastic |
Stickers |
NA |
NA |
NA |
NA |
NA |
| 003432 |
58 |
Plastic |
Stickers |
NA |
NA |
NA |
NA |
NA |
| 003434 |
58 |
Plastic |
Stickers |
NA |
NA |
NA |
NA |
NA |

## # A tibble: 181 × 3
## # Groups: name [181]
## name rgb count_color
## <chr> <chr> <int>
## 1 Aqua B3D7D1 63
## 2 Black 05131D 7803
## 3 Blue 0055BF 2869
## 4 Bright Green 4B9F4A 704
## 5 Bright Light Blue 9FC3E9 514
## 6 Bright Light Orange F8BB3D 1037
## 7 Bright Light Yellow FFF03A 528
## 8 Bright Pink E4ADC8 602
## 9 Brown 583927 818
## 10 Chrome Antique Brass 645A4C 1
## # ℹ 171 more rows
Most popular colors
## # A tibble: 22 × 4
## # Groups: name [22]
## name rgb count_color rgb_fill
## <chr> <chr> <int> <chr>
## 1 Black 05131D 7803 #05131D
## 2 Blue 0055BF 2869 #0055BF
## 3 Bright Light Orange F8BB3D 1037 #F8BB3D
## 4 Dark Blue 0A3463 1819 #0A3463
## 5 Dark Bluish Gray 6C6E68 2939 #6C6E68
## 6 Dark Gray 6D6E5C 1018 #6D6E5C
## 7 Dark Red 720E0F 1673 #720E0F
## 8 Dark Tan 958A73 1059 #958A73
## 9 Green 237841 1995 #237841
## 10 Light Bluish Gray A0A5A9 3321 #A0A5A9
## # ℹ 12 more rows